
from bs4 import BeautifulSoup       # 网页解析，获取数据
import re       # 正则表达式 - 字符串匹配
import urllib       # 指定url，获取网页数据
import xlwt         # 进行excel写入操作
import sqlite3      # 进行sqlite数据库操作
import urllib.request
# 获取post的请求
import urllib.parse


def main():
    """Entry point: crawl the doctor listing, then persist the rows.

    The URL and save-path literals below are placeholders the user is
    expected to fill in before running.
    """
    # Placeholder: hospital page URL on the haodf.com platform + "/menzhen_"
    baseurl = r"好大夫平台上的医院链接  + /menzhen_"

    # Crawl the listing pages and parse each doctor's data.
    datalist = getData(baseurl)
    # askURL(baseurl)

    # Persist to an Excel workbook (placeholder: absolute path).
    savepath = r"保存地址绝对路径.xls"
    saveData(datalist, savepath)

    # Alternative: persist to a SQLite database instead.
    #savePath = r"同上.db"
    #saveData2DB(datalist, savePath)
    print("数据保存成功，爬取结束.....")

# Pre-compiled regexes used to scrape fields out of haodf.com doctor pages.
findDoctorWeb = re.compile(r'<a class="name" href="(.*?)"') # URL of the doctor's personal page
findIntroduction = re.compile(r'<p class="init-content">(.*?)</p>',re.S) # doctor's introduction text
findGoodAt = re.compile(r'<p class="content">(.*?)</p>',re.S)  # doctor's areas of expertise
findDocHosp = re.compile(r'<a href="(.*?)">(.*?)</a>',re.S) # hospital / department link (group 2 is the visible name)
findDocDep = re.compile(r'<a href="(.*?)">(.*?)</a>',re.S)  # same pattern as findDocHosp, used for the department link
findDocTitle = re.compile(r'<span class="doctor-title">(.*?)</span>')    # professional title
findRecom = re.compile(r'<span class="value">(.*?)</span>')         # overall recommendation score
findConsult = re.compile(r'<span class="value js-consult-count">(.*?)</span>')  # online consultation count
findSch = re.compile(r'<p class="schedule-date">(.*?)</p>') # clinic schedule entries
findsatify = re.compile(r'<i class="sta-num">(\d+\.?\d*)<span class="percent">')     # satisfaction percentage

# --- private scraping helpers -----------------------------------------------

def _fetch_text(url):
    """Fetch *url* via askURL and decode the body as UTF-8.

    askURL returns bytes on success and the empty str "" on failure, so a
    str result is the failure sentinel; return None in that case.
    """
    raw = askURL(url)
    if isinstance(raw, str):
        return None
    return raw.decode('utf-8')


def _first_match(pattern, text, default='null'):
    """Return the first regex match of *pattern* in *text*, or *default*."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def getData(baseurl):
    """Scrape the first two listing pages under *baseurl* and return rows.

    For every doctor linked from the listing pages this fetches the
    doctor's personal page plus several ajax endpoints and builds one
    list per doctor, in the column order expected by saveData():
    id, introduction, gender, expertise, hospital, department, hospital
    grade, province, city, title, satisfaction, recommendation score,
    consultation count, total visits, article count, post-visit reports,
    patient votes, thank-you letters, last online, account opened,
    gifts, schedule.
    """
    print("爬取中.....")

    datalist = []
    for page in range(0, 2):
        # Listing pages are numbered from 1: <baseurl>1.htm, <baseurl>2.htm
        html = askURL(baseurl + str(page + 1) + '.htm')
        soup = BeautifulSoup(html, "html.parser")

        for item in soup.find_all('a', class_="name"):  # links to doctor pages
            docURL = re.findall(findDoctorWeb, str(item))

            for doctorurl in docURL:
                data = []
                doctorHtml = _fetch_text(doctorurl)
                if doctorHtml is None:  # fetch failed -> skip this doctor
                    continue

                # Doctor id; guard the lookup so a layout change skips the
                # doctor instead of raising IndexError.
                docIds = re.findall(r'doctorId: "(\d+\.?\d*)"', doctorHtml)
                if not docIds:
                    continue
                data.append(docIds[0])

                # Introduction text (and the gender embedded in it).
                docInfo = re.findall(findIntroduction, doctorHtml)
                if docInfo:
                    docInfo = docInfo[0].strip().replace('<br>', '')
                else:
                    docInfo = 'null'
                data.append(docInfo)
                data.append(_first_match('，(男|女)，', docInfo))

                # Areas of expertise.
                docGA = re.findall(findGoodAt, doctorHtml)
                data.append(docGA[0].strip() if docGA else 'null')

                # Hospital(s) and department(s). The two links inside each
                # faculty <li> share one pattern; guard the indexes (the
                # original assumed at least two links and could raise).
                facultySoup = BeautifulSoup(doctorHtml, "html.parser")
                hops = []
                department = []
                for faculty in facultySoup.find_all('li', class_="doctor-faculty"):
                    links = re.findall(findDocHosp, str(faculty))
                    if len(links) > 0:
                        hops.append(links[0][1])
                    if len(links) > 1:
                        department.append(links[1][1])
                data.append(str(set(hops)))
                data.append(str(set(department)))

                # Hospital grade / province / city are fixed because the
                # crawl is per-hospital.
                data.append('二甲')
                data.append('广东省')
                data.append('广州')

                data.append(_first_match(findDocTitle, doctorHtml))  # title
                data.append(_first_match(findsatify, doctorHtml))    # satisfaction
                data.append(_first_match(findRecom, doctorHtml))     # recommendation

                # spaceId keys all the ajax statistics endpoints below.
                spaceIds = re.findall(r'spaceId: "(.*?)",', doctorHtml)
                if not spaceIds:
                    continue
                spaceId = spaceIds[0]

                # Online consultation count (ajax).
                temp = _fetch_text('https://www.haodf.com/ndoctor/ajaxGetConsutNum?spaceId=' + spaceId)
                if temp is None:
                    continue
                data.append(_first_match(r'"total":(\d+\.?\d*)', temp, 0))

                # Total page views (ajax). The original decoded without the
                # failure guard and crashed when askURL returned "".
                temp = _fetch_text('https://www.haodf.com/ndoctor/ajaxGetTotalPV?id=' + spaceId + '&type=space')
                data.append(_first_match(r'"data":(\d+\.?\d*)', temp or '', 0))

                # Article count is not scraped here.
                data.append(0)

                # Post-visit patient reports (ajax). The original checked
                # len < 3 but indexed [3] -- off-by-one fixed here.
                temp = _fetch_text('https://www.haodf.com/ndoctor/ajaxGetDoctorOtherData?startTime=2021-07-07+00%3A00%3A00&endTime=2021-07-07+23%3A59%3A59&spaceId=' + spaceId)
                nums = re.findall(r'(\d+\.?\d*)', temp or '')
                data.append(nums[3] if len(nums) > 3 else 0)

                # Patient votes, thank-you letters, last-online and
                # account-opened dates come from one js-data endpoint.
                temp = _fetch_text('https://www.haodf.com/ndoctor/ajaxGetDoctorData?spaceId=' + spaceId) or ''
                data.append(_first_match(r'"doctorVoteCnt":"(\d+\.?\d*)"', temp))
                data.append(_first_match(r'"thankLetterCount":"(\d+\.?\d*)"', temp))
                data.append(_first_match(r'"spaceActiveDate":"(.*?)"', temp))
                data.append(_first_match(r'"openSpaceTime":"(.*?)"', temp))

                # Gifts received (ajax).
                temp = _fetch_text('https://www.haodf.com/ndoctor/ajaxGetCntOfPresent?spaceId=' + spaceId) or ' '
                data.append(_first_match(r'"data":(\d+\.?\d*)', temp))

                # Schedule: keep the second whitespace-separated field of
                # each entry; guard the split (original assumed a space).
                docSch = []
                for sch in re.findall(findSch, doctorHtml):
                    parts = sch.split(' ')
                    if len(parts) > 1:
                        docSch.append(parts[1])
                data.append(str(set(docSch)))

                datalist.append(data)

    print("爬取完成.....")
    return datalist


# Fetch the content of the given url.
def askURL(url):
    """Fetch *url* and return the raw response body as bytes.

    On any URLError the error code/reason is printed and the empty str
    "" is returned instead -- callers rely on a str result meaning
    "request failed".
    """
    header = {
        "User-Agent": r""  # IMPORTANT: fill in your own browser's User-Agent to avoid bot detection
                      r""
    }

    req = urllib.request.Request(url=url, headers=header)
    html = ""
    try:
        # Context manager closes the response even if read() raises
        # (the original leaked the connection object).
        with urllib.request.urlopen(req) as response:
            html = response.read()
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html



# Persist the scraped rows to an Excel workbook.
def saveData(datalist, savepath):
    """Write *datalist* (one list per doctor) to an .xls file at *savepath*.

    Row 0 is the header; each subsequent row is one doctor's record in
    the column order produced by getData().
    """
    print("saving.......")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # workbook object
    sheet = book.add_sheet('sheet1',cell_overwrite_ok=True)  # worksheet

    header = ("id", "简介", "性别", "擅长", "医院", "诊室", "医院级别", "所在省","所在市",
              "医生职称","在线服务满意度", "综合推荐热度", "在线问诊量", "总访问", "文章数",
              "诊后报道患者数 ", "患者投票","感谢信", "上次在线", "开通时间","心意礼物",
              "出停诊信息")

    # Header row.
    for col, title in enumerate(header):
        sheet.write(0, col, title)

    # Data rows start at row 1, directly below the header.
    for rownum, record in enumerate(datalist, start=1):
        for col, value in enumerate(record):
            sheet.write(rownum, col, value)

    book.save(savepath)

# Create the SQLite schema used by saveData2DB.
def initDB(savepath):
    """Create the ``gdszy`` table in the SQLite database at *savepath*.

    Raises sqlite3.OperationalError if the table already exists. The
    original DDL had ``总访问text`` (missing space), which made SQLite
    create a type-less column literally named "总访问text"; fixed to a
    text column named "总访问".
    """
    conn = sqlite3.connect(savepath)
    try:
        sql = '''create table gdszy(
        id int primary key, 简介 text, 性别 text, 擅长 text, 医院 text, 诊室 text,
        医院级别 text, 所在省 text,所在市 text, 医生职称 text,
        在线服务满意度 text, 综合推荐热度 text, 在线问诊量 text, 总访问 text,
        文章数 text, 诊后报道患者数 text, 患者投票 text,感谢信 text,
        上次在线 text, 开通时间 text,心意礼物 text, 出停诊信息 text
        )'''
        conn.execute(sql)
        conn.commit()
    finally:
        # Close even if execute fails (original leaked the connection).
        conn.close()

def saveData2DB(datalist, savepath):
    """Insert the rows of *datalist* into the ``gdszy`` table at *savepath*.

    Values are bound through ``?`` placeholders. The original built the
    SQL by wrapping each value in double quotes and string-formatting it
    into the statement, which broke on any value containing a quote and
    was open to SQL injection; it also mutated the caller's rows in
    place. All values are stringified, matching the original behavior.
    """
    print("保存数据.....")
    #initDB(savepath)
    conn = sqlite3.connect(savepath)
    try:
        cursor = conn.cursor()
        for data in datalist:
            placeholders = ','.join('?' * len(data))
            cursor.execute(
                "insert into gdszy values (%s)" % placeholders,
                [str(value) for value in data],
            )
        conn.commit()
    finally:
        conn.close()

if __name__ == "__main__":
    main()
